/*          
    Purpose: Clean demographic variables  
             for adult child respondents (heads/reference
             persons and wives/spouses/cohabitating partners)
             able to be linked to at least one parent.
             Create 3- and 5-year averages of actual income 
             and parental actual income for these respondents.

    Creates: PSID_bothparents_clean.dta
*/

clear
set more off

cd "$Mydirectory1/1_DataSources/PSID"

**-----------------------------------------------------------------------------**
*                 PART 1: ADULT CHILDREN
**-----------------------------------------------------------------------------**

*-----------------------------------------------*
* Load the wide PSID individual-family extract
*-----------------------------------------------*

use ./RawData/PSID_raw_indfam.dta, clear

* Discard the raw variable groups that are not taken from this file
    drop age* race* HHlabor* h_w_inc* maritalstatus* selfemployed* union* ///
         h_w_acc* grade* agehead* mainocc* intnumber* HHunioncontract*    ///
         ER* V* laborinc*

* Person-level key (wide data): 1968 family ID combined with person number
    generate son_id = famid*1000 + personnumber
    label variable son_id "Child 1968 ID"
    order son_id, before(famid)

*------------------------------------*
* MERGE IN OTHER RELEVANT VARIABLES
*------------------------------------*
* Attach the auxiliary person-level files on son_id, in this order:
*   (a) geographic variables
*   (b) retrospective parental occupations
*   (c) cross-sectional weights
*   (d) other demographic variables
* Rows found only in an auxiliary file are not kept.
    foreach auxfile in ./RawData/geo_headspouse.dta        ///
                       ./RawData/parentalocc.dta           ///
                       ./RawData/xsection_weights_info.dta ///
                       ./RawData/extra_demographicvars.dta {
        merge 1:1 son_id using `auxfile', keep(master match) nogenerate
        sort son_id
    }

    * Variable groups from the merged files that are not needed
    drop selfemployed* intdate* month_born*

* PSID FIMS sample: keep only respondents who can be linked
* to at least one parent (matched rows only)
    merge 1:1 son_id using ./FIMS/FIMSEitherparentKids_SRC_SEO.dta, keep(match) nogenerate
    order son_id father_id mother_id momlink_only link_bothparents

* Remove every variable whose name ends in 2017 or 2019
    drop *2017 *2019 

* Reshape to long format: one row per respondent (son_id) and survey year.
* Only the variables named in the keep statement are carried into the reshape.
    keep son_id father_id mother_id famid momlink_only link_bothparents age* relate* race* sex ///
         totfaminc* sequencenumber* reggrewup* xsection_weight_*                               ///
         father_occ_* mother_occ_* state_born* num_* follow*                                   ///
         birthyear* *livedw* spouse_inFU* ageind*

    reshape long agehead_ agespouse_ ageind_ relate sequencenumber num_ racehead_ racespouse_ ///
                 totfaminc reggrewup_HD_ reggrewup_SP_ xsection_weight_                      ///
                 father_occ_HD_ father_occ_SP_ mother_occ_HD_ mother_occ_SP_                 ///
                 state_born_HD_ state_born_SP_ followstatus_ birthyear_                      ///
                 spouselivedw_bothparents_ spouse_inFU headlivedw_bothparents_,              ///
                 i(son_id father_id mother_id momlink_only link_bothparents sex) j(year)

    * Enforce that son_id and year jointly identify a row. The built-in isid
    * stops the do-file with an error if they do not, whereas the user-written
    * "unique" command used previously only printed counts.
    isid son_id year

    sort son_id year
    order son_id year

    * Strip the trailing underscore that the reshape stubs leave on variable names
    rename *_ * 
    * The num stub is the family interview number
    ren num interviewnumber

****************************************
* OTHER PSID-SPECIFIC RESTRICTIONS
****************************************

* (1) Drop all respondents who have no family interview number in any
*     wave from 1997 onwards.
*     Requires the data to be sorted by son_id (sorted on son_id year earlier).

    * Wave-level flag, defined only for waves in 1997 or later:
    * 1 when the interview number is missing, 0 otherwise
    gen miss = cond(interviewnumber==., 1, 0) if year>=1997

    * Person-level flag: equals 1 only when every 1997-or-later wave is flagged
    by son_id: egen always_missing = min(miss) if year>=1997
    replace always_missing = 0 if year<1997
    tabulate always_missing, missing

    * Sanity check: flagged rows carry no family income
    assert totfaminc==. if always_missing==1

    * Spread the flag over all of the person's rows, then drop the person
    by son_id: egen drop_nonrep = max(always_missing)
    assert drop_nonrep==1 if always_missing==1
    drop if drop_nonrep==1

    drop *miss* *nonrep

/* (2) Drop respondents who were NEVER a head/reference person or 
       a wife/spouse/cohabitating partner from 1997 onwards */

       /* Relationship codes used below:
                 10      head/reference person
                 20, 22  spouse/wife/cohabitating partner

          Deliberately NOT treated as target persons:
                 90      "legal husband of head"
                 88      "first-year cohabitor of head"

          NOTE(review): the flag below is computed over every year in
          memory; it is not limited to 1997 onwards as the heading says.
          Confirm which of the two is intended.
*/

    * Row-level flag: 1 when the row is neither a head nor a spouse/partner
    gen nontarget_person = (relate!=10 & relate!=20 & relate!=22)

    * Person-level minimum is 1 only if every row of the person is flagged
    by son_id: egen drop_nontarget_person = min(nontarget_person)

    drop if drop_nontarget_person==1
    drop nontarget_person drop_nontarget_person

/* (3a) and (3b)
       In some year-family-interview cells there are two heads/reference
       persons (3a) or two spouses/wives/cohabitating partners (3b).
       For the second, "mover-out" person (sequence number outside 1-20)
       the key demographic variables are recoded to "." in that year.
       These persons are not dropped, because they may have been the
       only, actual head (or spouse/partner) in previous survey waves.

       The head pass runs first and the spouse pass second.
*/
    sort year famid interviewnumber

    foreach role in head spouse {

        * Flag rows holding this role
        if "`role'"=="head"   gen tag`role' = relate==10
        if "`role'"=="spouse" gen tag`role' = (relate==20 | relate==22)

        * Number of persons holding the role within the cell
        by year famid interviewnumber: egen total`role's = total(tag`role'==1)

        * Blank the demographics of the mover-out person in cells with two
        foreach var of varlist sex totfaminc state_born_SP-followstatus {
            replace `var'=. if (!inrange(sequencenumber,1,20) & tag`role'==1 & total`role's==2)
        }

        drop tag`role' total`role's
    }

******************
* WEIGHT
******************
/* The 1997 cross-sectional weight is attached to every row of a
   respondent. Per the original author, about 95% of respondents
   have a non-missing weight.
*/
    sort son_id year

    * Diagnostic: does the respondent have a usable (non-zero, non-missing) 1997 weight?
    gen wgtavail_1997 = (year==1997 & xsection_weight!=0 & xsection_weight!=.)
    by son_id: egen total_wgtavail_1997 = total(wgtavail_1997==1)
    tabulate total_wgtavail_1997, missing

    * Copy the 1997 value to all of the person's rows (max ignores missings)
    gen weight1997_temp = xsection_weight if year==1997
    by son_id: egen weight1997 = max(weight1997_temp)
    count if weight1997!=0 & weight1997!=.

    drop wgtavail_1997 total_wgtavail_1997 


*******************************************************************
* DEMOGRAPHICS
*******************************************************************

/* Region grew up and race are coded up first, in order to retain
   as many non-missing responses as possible.
*/

*-----------------------------------*
* (Modal) region where R grew up
*-----------------------------------*

    * Head-reported region (rows where R is head):
    * codes 0, 6 and 9 become missing; code 5 is folded into code 4
    tabulate reggrewup_HD if relate==10, missing
    replace reggrewup_HD = . if inlist(reggrewup_HD,0,6,9) & relate==10
    replace reggrewup_HD = 4 if reggrewup_HD==5 & relate==10
    tabulate reggrewup_HD if relate==10, missing

    * Spouse/partner-reported region (rows where R is spouse/partner): same recodes
    tabulate reggrewup_SP if inlist(relate,20,22), missing
    replace reggrewup_SP = . if inlist(reggrewup_SP,0,6,9) & inlist(relate,20,22)
    replace reggrewup_SP = 4 if reggrewup_SP==5 & inlist(relate,20,22)
    tabulate reggrewup_SP if inlist(relate,20,22), missing

    * Modal answer within each reporting role
    by son_id: egen mode_reggrewup_HD = mode(reggrewup_HD) if relate==10
    by son_id: egen mode_reggrewup_SP = mode(reggrewup_SP) if inlist(relate,20,22)

    * One column holding the role-specific mode for the role held on each row
    gen region4_childhood_temp = mode_reggrewup_HD if relate==10
    replace region4_childhood_temp = mode_reggrewup_SP if inlist(relate,20,22)
    tabulate region4_childhood_temp, missing

    /* Some respondents give conflicting answers, which seems to happen
       when their relationship to the head changes (e.g., from head to
       spouse). This affects a small fraction of respondents; the modal
       value across all of the person's rows is used. */
    by son_id: egen region4_childhood = mode(region4_childhood_temp)

    * Check: exactly one distinct region value per id
    sort son_id region4_childhood
    egen tagregion = tag(son_id region4_childhood)
    tabulate tagregion, missing

    by son_id: egen totalregion = total(tagregion==1)
    tabulate totalregion, missing //Confirmed

    tabulate region4_childhood, missing
    tabulate region4_childhood_temp, missing

    drop mode_* *_temp tag* totalregion 

    * Indicator for region code 3 (the South); missing when region is missing
    gen south_merge = (region4_childhood==3) if region4_childhood!=.
    tabulate south_merge, missing

*------------------------*
* (Modal) race
*------------------------*

    * Keep untouched copies of the raw race variables (dropped at the end of this section)
    clonevar racehead_orig = racehead
    clonevar racespouse_orig = racespouse

    * Only codes 1 and 2 are retained; anything else becomes missing,
    * each variable being cleaned only on rows where R holds that role
    replace racehead = . if !inlist(racehead,1,2) & relate==10
    replace racespouse = . if !inlist(racespouse,1,2) & inlist(relate,20,22)

    * Modal answer within each reporting role
    by son_id: egen mode_racehead = mode(racehead) if relate==10
    tabulate mode_racehead, missing

    by son_id: egen mode_racespouse = mode(racespouse) if inlist(relate,20,22)
    tabulate mode_racespouse, missing

    * One column holding the role-specific mode for the role held on each row
    gen race_temp = mode_racehead if relate==10
    replace race_temp = mode_racespouse if inlist(relate,20,22)
    tabulate race_temp, missing

    /* As with region grew up, conflicting answers are resolved by
       taking the modal value across all of the person's rows. */
    by son_id: egen race = mode(race_temp)

    * Check: exactly one distinct race value per id
    sort son_id race
    egen tagrace = tag(son_id race)
    tabulate tagrace, missing

    by son_id: egen totalrace = total(tagrace==1)
    tabulate totalrace, missing //confirmed

    tabulate race, missing
    tabulate race_temp, missing

    drop mode_* *_orig *_temp tag* totalrace

*------------------------*
* Age
*------------------------*
/* The age-individual variable is used because, per the original
   author, it seems to be of better quality than the age-head or
   age-spouse variables.
*/
    * Codes 0 and 999 are treated as missing
    replace ageind = . if inlist(ageind,999,0)

    rename ageind age

    gen agesq = age*age

    //IMPORTANT: RESTRICT AGE TO 30-50
    //           (this restriction makes the panel unbalanced)
    keep if inrange(age,30,50)

*------------------------*
* Whether foreign-born
*------------------------*

    * Clean state-born: code 99 becomes missing, each variable
    * only on rows where R holds the corresponding role
    replace state_born_HD = . if state_born_HD==99 & relate==10
    replace state_born_SP = . if state_born_SP==99 & inlist(relate,20,22)

    * Foreign-born means state-born code 0. Heads use the head variable;
    * spouses/partners use the spouse variable when spouse_inFU equals 1.
    gen foreignborn = (state_born_HD==0) if relate==10 & state_born_HD<.
    replace foreignborn = (state_born_SP==0) if inlist(relate,20,22) & state_born_SP<. & spouse_inFU==1 & foreignborn==.
    tabulate foreignborn, missing

    * Person-level count of rows reporting foreign-born,
    * so that the respondent has the same value in all years
    by son_id: egen total_foreignborn = total(foreignborn==1)

    /* One respondent (son_id = 5757034) has conflicting state-born
       information in two years. That respondent is kept, recoded
       to ".", and flagged. */
    replace foreignborn = . if son_id==5757034
    replace total_foreignborn = 0 if son_id==5757034
    gen flag_foreignborn = (son_id==5757034)

    //IMPORTANT: RESTRICT TO NATIVE-BORN
    keep if total_foreignborn==0

    /* No "modal" foreign-born variable is built: per the original
       author, with the one exception above there is no conflict in
       information between the 2013 and 2015 responses.
    */

*------------------------*
* BIRTH YEAR 
*------------------------*

    * Code 9999 is treated as missing.
    /* Per the original author, all "0" birth-year responses are
       dropped once all restrictions are made, so birth year is
       not cleaned further. */
    replace birthyear = . if birthyear==9999

    * Check how many distinct birth years each son_id has
    egen tagby = tag(son_id birthyear)
    by son_id: egen totalby = total(tagby==1)
    tabulate totalby, missing
    drop tagby totalby

    /* There are a couple of instances of multiple birth years
       for one respondent; the modal value is used. */
    rename birthyear birthyear_orig
    by son_id: egen birthyear = mode(birthyear_orig)
    tabulate birthyear, missing
    tabulate birthyear_orig, missing

    * Re-check after taking the mode
    egen tagby = tag(son_id birthyear)
    by son_id: egen totalby = total(tagby==1)
    tabulate totalby, missing //Confirmed: 1 birth year per son_id
    drop tagby totalby


**********************************************
* PARENTAL OCCUPATION (RETROSPECTIVE, MODAL)
**********************************************
    
* Step 1: crosswalk 
  /* Note: 1970 Census codes for survey waves 1997-2001, 
           2000 Census codes for survey waves 2003-2015. */

  /* For each of the four raw occupation variables (father/mother,
     as reported on the head or the spouse record), the loop below
     adds two coarsened variables from the crosswalk files:
     one for the 1970-coded waves and one for the 2000-coded waves.
     Resulting names: fatheroccej_HD_1970, fatheroccej_HD_2000, etc. */

    foreach var in father_occ_HD mother_occ_HD father_occ_SP mother_occ_SP  {

        // Suffix recording which record (head or spouse) the report is on
        if "`var'"=="father_occ_HD" | "`var'"=="mother_occ_HD" local shortname "HD" 
        if "`var'"=="father_occ_SP" | "`var'"=="mother_occ_SP" local shortname "SP"

        // Temporary merge keys: the raw code, filled only in the waves
        // that use the corresponding Census coding scheme
        gen census1970 = `var' if inrange(year,1997,2001)
        label var census1970 "==`var' (renamed to facilitate a merge)"

        gen census2000 = `var' if inrange(year,2003,2015)
        label var census2000 "==`var' (renamed to facilitate a merge)"

        * Bring in crosswalk
        foreach it in 1970 2000 {

            // The coarsened variable is named fatheroccej in the 1970
            // crosswalk file and fatheroccej_2000 in the 2000 file
            if `it'==1970 local lab1 ""
            if `it'==2000 local lab1  "_2000"
        
            // Prepare a renamed copy of the crosswalk without
            // disturbing the respondent data in memory
            preserve

                use ../Crosswalks/Crosswalk_`it'Census_toANES.dta, clear
                
                if "`var'"=="father_occ_HD" | "`var'"=="father_occ_SP"  {
                    ren fatheroccej`lab1' fatheroccej_`shortname'_`it'
                    label var fatheroccej_`shortname'_`it' "father occ (`shortname'), `it', coarsened"
                }
                // The same crosswalk variable is also used for mothers;
                // it is only renamed differently
                if "`var'"=="mother_occ_HD" | "`var'"=="mother_occ_SP"  {
                    ren fatheroccej`lab1' motheroccej_`shortname'_`it'
                    label var motheroccej_`shortname'_`it' "mother occ (`shortname'), `it', coarsened"
                }

                // Align the 2000 key name with the temporary key above
                if `it'==2000 ren occ2000 census2000
                
                sort census`it'
                tempfile crossw
                save `crossw'

            restore

            sort census`it'
            merge m:1 census`it' using `crossw' 

            // Unmatched respondent rows may only have code 999, 0 or missing
            assert inlist(census`it',999,0,.) if _merge==1
            tab census`it' if _merge==1, m
            // Diagnostics on matched rows whose code is 0 or missing
            count if (census`it'==0 | census`it'==.) & _merge==3
            tab census`it' if _merge==3, m
            // Discard crosswalk-only rows, then the merge key itself
            drop if _merge==2
            drop _merge
            drop census`it'

        }
    }

/* Step 2: Combine the 1970 and 2000 crosswalk results into
           4 harmonized variables:
           (1) dad occ, HD; 
           (2) mom occ, HD; 
           (3) dad occ, SP; 
           (4) mom occ, SP
           Each is filled only on rows where R holds the matching role
           (HD: relate 10; SP: relate 20 or 22).
*/
    foreach role in HD SP {

        if "`role'"=="HD" local cond "relate==10"
        if "`role'"=="SP" local cond "inlist(relate,20,22)"

        foreach parent in father mother {

            local name `parent'occej_`role'

            * 1970-coded waves first, then 2000-coded waves
            gen `name'_temp = `name'_1970 if inrange(year,1997,2001)  & `cond'
            replace `name'_temp = `name'_2000 if inrange(year,2003,2015)  & `cond'

            tabulate `name'_temp if `cond', missing
            tabulate `name'_temp, missing
        }
    }

/* Step 3: find the modal father occupation and the modal
           mother occupation for each respondent
*/
    sort son_id

    * (a) Modal value within each reporting role
    foreach parent in father mother {
        by son_id: egen mode_`parent'occej_HD = mode(`parent'occej_HD_temp) if relate==10
        tabulate mode_`parent'occej_HD, missing

        by son_id: egen mode_`parent'occej_SP = mode(`parent'occej_SP_temp) if inlist(relate,20,22)
        tabulate mode_`parent'occej_SP, missing
    }

    * (b) One column per parent holding the role-specific mode
    *     for the role held on each row
    foreach parent in father mother {
        gen `parent'occej_temp = mode_`parent'occej_HD if relate==10
        replace `parent'occej_temp = mode_`parent'occej_SP if inlist(relate,20,22)
        tabulate `parent'occej_temp, missing
    }

    /* Some respondents report conflicting father or mother occupations,
       which seems to happen when their relationship to the head changes
       (e.g., from head to spouse). A small fraction of respondents is
       affected; the modal value across all rows is taken (again). */

    * (c) Modal value across all of the person's rows
    foreach parent in father mother {
        by son_id: egen `parent'occej = mode(`parent'occej_temp)
    }

    * Check: one value of each occupation variable per son_id
    local occ_dad fatheroccej
    local occ_mom motheroccej
    foreach parent in dad mom {
        local var `occ_`parent''

        sort son_id `var'
        egen tag`parent' = tag(son_id `var')
        tabulate tag`parent', missing

        by son_id: egen total`parent' = total(tag`parent'==1)
        tabulate total`parent', missing //Confirmed
    }

    drop mode_* tag* totaldad totalmom

    tabulate fatheroccej, missing
    tabulate motheroccej, missing

    /*EXTRA STEP: For respondents with missing 
                  father or mother occupation, assign
                  them the first non-missing answer 
                  that they give. 

      "First" means the row with the lowest age, ties broken by the
      earlier survey year. The row is picked with an explicit ordered
      bysort rather than egen tag(), because tag() marks one row per
      group without promising which one, so it could pick a later
      answer. */

    foreach parentocc in fatheroccej motheroccej  {

        * Rows with a valid report for the role R holds on that row
        gen byte nonmissing = ((`parentocc'_HD_temp!=. & relate==10) | (`parentocc'_SP_temp!=. & inlist(relate,20,22)))

        * Earliest valid report per respondent (1 on that row, missing on rows without a report)
        bysort son_id nonmissing (age year): gen byte firstnonmissing = (_n==1) if nonmissing==1
        
        * Value given in that earliest report
        gen firstnonm_value = `parentocc'_HD_temp if firstnonmissing==1 & relate==10
        replace firstnonm_value = `parentocc'_SP_temp if firstnonmissing==1 & inlist(relate,20,22)

        * Only one row per person is non-missing, so min simply copies it to all rows
        bysort son_id: egen firstreport = min(firstnonm_value)

        * Fill only where the modal occupation is missing
        replace `parentocc' =firstreport if `parentocc'==. & firstreport!=. 

        drop *nonmissing firstnonm_value firstreport     
    }

    * Leave the data sorted as the original code did
    sort son_id age 


/* Per the original author, previous PSID work verified that no
   retrospective information is available on whether mom/dad was
   self-employed, so parental occupation cannot be adjusted for
   self-employment. */

*******************************
* FATHER NOT WORKING DUMMY
*******************************

    * 0 whenever a father occupation is available
    gen father_notworking_temp = 0 if fatheroccej!=.

    * 1 when no father occupation is available and the raw occupation code
    * is 0 on the record for the role R holds. For spouse records the
    * agespouse!=0 restriction is the one recommended by the PSID.
    replace father_notworking_temp = 1 if fatheroccej==. & ((father_occ_HD==0 & relate==10) | (father_occ_SP==0 & agespouse!=0 & inlist(relate,20,22)))
    tabulate father_notworking_temp, missing

    * Person-level value: mode across the person's rows
    by son_id: egen father_notworking = mode(father_notworking_temp)

    * Check: one value of father_notworking per son_id
    sort son_id father_notworking
    egen tagnwork = tag(son_id father_notworking)
    tabulate tagnwork, missing

    by son_id: egen totalnwork = total(tagnwork==1)
    tabulate totalnwork, missing //Confirmed

    drop *_temp tagnwork totalnwork

**********************************************
* FAMILY INCOME (ACTUAL, AVERAGED AROUND 40)
**********************************************

*----------------------------------------------------*
/* SUPER IMPORTANT: TOTFAMINC IS CONVERTED TO
                    1950$ **BEFORE** AVERAGING */
*----------------------------------------------------*

    sort son_id year

    * Keep the nominal value; non-positive incomes are treated as missing
    gen totfaminc_orig = totfaminc
    replace totfaminc = . if totfaminc<=0

    * The deflator is looked up for the year before the survey year
    * (presumably because reported income refers to the prior year -- confirm)
    gen year_CPI = year-1
    merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

    replace totfaminc = totfaminc*deflator
    label variable totfaminc "Total family income (adult R), 1950 dollars"

    drop year_CPI CPI deflator
    sort son_id
  
    /* Builds, for c = 3 and c = 5, the person-level variable
       mean_totfaminc_(c)years: the mean of deflated totfaminc over the
       c observations with non-missing income whose age is closest to 40,
       searching ages 30 to 50. The value is missing for persons with
       fewer than c usable observations.
       Requires the data to be sorted by son_id. */
    global numberyears "3 5" 
    foreach c in $numberyears {
                
        * Center around age 40
        local child_center=40
        local center `child_center'
        
        * Look between 30 and 50
        local child_band=10 
        local band `child_band'
       
        * # of observations that'll be tagged (changes from 3 to 5)
        local ccount=`c' 
        local obs_ct `ccount'

        * Binary: data is okay to use (i.e., there's non-missing income at age 40)
        gen indchild_totfaminc = 0
        replace indchild_totfaminc = 1 if totfaminc!=. & age==`center'
            
        * Find total of this binary for each person
        by son_id: egen total_indc_totfaminc = total(indchild_totfaminc)

    /* Now iteratively search the bands starting 
       at the center. Tag the observation in the sample 
       if it's non-missing and the adult respondent has not 
       reached the observation count (here, 3 or 5). 
       The total number of observations used is then 
       recalculated. Note that the upper band is arbitrarily 
       privileged: at each distance i from the center, age 40+i
       is tried before age 40-i.
       Note also that all rows of a person at the same age are
       tagged in the same step, so a person with repeated ages
       can end up with more than the target number of tagged rows.
    */ 
        forval i = 0/`band' {
            * Upper band: age 40+i (i = 0 is the center itself)
            replace indchild_totfaminc = 1 if totfaminc!=. & age==(`center'+`i') & (total_indc_totfaminc<`obs_ct')
            
            * Re-calculate the total observations used
            drop total_indc_totfaminc
            by son_id: egen total_indc_totfaminc = total(indchild_totfaminc)

            * Lower band: age 40-i
            replace indchild_totfaminc = 1 if totfaminc!=. & age==(`center'-`i') & total_indc_totfaminc<`obs_ct'
            
            * Re-calculate the total observations used
            drop total_indc_totfaminc
            by son_id: egen total_indc_totfaminc = total(indchild_totfaminc) 
    }

    * Calculate average income (3 or 5 years) using tagged years 
        * The mean is computed on tagged rows only, then copied to all rows via max
        by son_id: egen mean_totfaminc_tmp_`c'years = mean(totfaminc) if indchild_totfaminc==1
        by son_id: egen mean_totfaminc_`c'years = max(mean_totfaminc_tmp_`c'years) 

        /*IMPORTANT: Replace average as "." for ids without `c' 
                     years of viable income */
        by son_id: replace mean_totfaminc_`c'years =. if total_indc_totfaminc<`obs_ct' 
        label var mean_totfaminc_`c'years "Child's avg. actual income using `c' years and totfaminc"

    * Rename the working variables so the next pass of the loop can recreate them
    ren indchild_totfaminc indchild_totfaminc_`c'years
    ren total_indc_totfaminc total_indc_totfaminc_`c'years 
    }

    * Remove the tagging and temporary variables, and the unused age variables
    drop *indc* *tmp* agehead agespouse 

    * Cleaned adult-child panel, held in a tempfile
    tempfile adultchildren
    save `adultchildren'

* File of father ids: one row per father_id
* NOTE(review): when a father has several children, the link_bothparents
* value kept is whichever row sorts first; confirm this is acceptable.
    preserve
        keep father_id link_bothparents
        bysort father_id: keep if _n==1

        save ./output/PSID_fatherids.dta, replace
    restore

/* File of mother ids, only for respondents who
   can be linked to a mother but not to a father */
    preserve
        keep if momlink_only==1
        * momlink_only MUST be kept in order to merge correctly later
        keep mother_id momlink_only
        bysort mother_id: keep if _n==1

        save ./output/PSID_motherids_momlinkonly.dta, replace
    restore

**-----------------------------------------------------------------------------**
*                 PART 2: LINKED PARENTS
**-----------------------------------------------------------------------------**

************
* FATHERS
************

    use ./RawData/PSID_raw_indfam.dta, clear

* Person-level key (wide data), built the same way as the child id
    generate father_id = famid*1000 + personnumber
    label variable father_id "Father 1968 ID"
    order father_id, before(famid)

    keep father_id relate* totfaminc* famid personnumber

* Keep only persons who appear in the saved file of father ids
* (fathers of the adult child respondents)
    sort father_id
    merge 1:1 father_id using ./output/PSID_fatherids.dta, keep(match) nogenerate

*----------------------------------------*
/* Merge in other demographic variables 
   for head/reference person and the 
   wife/spouse/cohabitating partner */
*----------------------------------------*

    * The demographic file is keyed on son_id, so the father key is
    * renamed for the merge and renamed back afterwards
    rename father_id son_id

    merge 1:1 son_id using ./RawData/extra_demographicvars.dta, keep(master match) nogenerate
    keep son_id relate* totfaminc* age* birthyear* link_bothparents

    rename son_id father_id

* Reshape (long): one row per father and survey year
    reshape long relate agehead_ agespouse_ ageind_ totfaminc birthyear_, ///
            i(father_id link_bothparents) j(year)

    sort father_id year

    * Strip the trailing underscore left on the reshaped stub names
    rename *_ *

* Age
/* The age-individual variable is used because, per the original
   author, it seems to be of better quality than the age-head or
   age-spouse variables.
*/
    * Codes 0 and 999 are treated as missing
    replace ageind = . if inlist(ageind,999,0)

    rename ageind age

    //IMPORTANT: RESTRICT AGE. AVERAGE INCOME IS TAKEN BETWEEN 30 AND 50
    //           (this restriction makes the panel unbalanced)
    keep if inrange(age,30,50)

**********************************************
* INCOME (ACTUAL, AVERAGED AROUND 40)
**********************************************

*----------------------------------------------------------------*
* SUPER IMPORTANT: TOTFAMINC IS CONVERTED TO 1950$ **BEFORE** AVERAGING
*----------------------------------------------------------------*

    sort father_id year

    * Keep the nominal value; non-positive incomes are treated as missing
    gen totfaminc_orig = totfaminc
    replace totfaminc = . if totfaminc<=0

    * The deflator is looked up for the year before the survey year
    * (presumably because reported income refers to the prior year -- confirm)
    gen year_CPI = year-1
    merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

    replace totfaminc = totfaminc*deflator
    label variable totfaminc "Total family income (fathers), 1950 dollars"

    drop year_CPI CPI deflator
    sort father_id

    /* Builds, for c = 3 and c = 5, the person-level variable
       mean_fathertotfaminc_(c)years: the mean of deflated totfaminc over
       the c observations with non-missing income whose age is closest
       to 40, searching ages 30 to 50. The value is missing for fathers
       with fewer than c usable observations.
       Requires the data to be sorted by father_id. */
    global numberyears "3 5" 
    foreach c in $numberyears {
                
        * Center around age 40
        local father_center=40
        local center `father_center'

        * Look between 30 and 50
        local father_band=10 
        local band `father_band'

        * # of observations that'll be tagged (changes from 3 to 5)
        local ccount=`c' 
        local obs_ct `ccount'

        * Binary: data is okay to use (i.e., there's non-missing income at age 40)
        gen indfather_totfaminc = 0
        replace indfather_totfaminc = 1 if totfaminc!=. & age==`center'
            
        * Find total of this binary for each person
        by father_id: egen total_indf_totfaminc = total(indfather_totfaminc)

    /* Now iteratively search the bands starting 
       at the center. Tag the observation in the sample 
       if it's non-missing and the father has not 
       reached the observation count (here, 3 or 5). 
       The total number of observations used is then 
       recalculated. Note that the upper band is arbitrarily 
       privileged: at each distance i from the center, age 40+i
       is tried before age 40-i.
       Note also that all rows of a person at the same age are
       tagged in the same step, so a person with repeated ages
       can end up with more than the target number of tagged rows.
    */ 
        forval i = 0/`band' {
            * Upper band: age 40+i (i = 0 is the center itself)
            replace indfather_totfaminc = 1 if totfaminc!=. & age==(`center'+`i') & (total_indf_totfaminc<`obs_ct')
            
            * Re-calculate the total observations used
            drop total_indf_totfaminc
            by father_id: egen total_indf_totfaminc = total(indfather_totfaminc)

            * Lower band: age 40-i
            replace indfather_totfaminc = 1 if totfaminc!=. & age==(`center'-`i') & total_indf_totfaminc<`obs_ct'
            
            * Re-calculate the total observations used
            drop total_indf_totfaminc
            by father_id: egen total_indf_totfaminc = total(indfather_totfaminc) 
            
    }

    * Calculate average income (3 or 5 years) using tagged years 
        * The mean is computed on tagged rows only, then copied to all rows via max
        by father_id: egen mean_fathertotfaminc_tmp_`c'years = mean(totfaminc) if indfather_totfaminc==1
        by father_id: egen mean_fathertotfaminc_`c'years = max(mean_fathertotfaminc_tmp_`c'years) 

        /*IMPORTANT: Replace average as "." for ids without `c' 
                     years of viable income */
        by father_id: replace mean_fathertotfaminc_`c'years =. if total_indf_totfaminc<`obs_ct' 
        label var mean_fathertotfaminc_`c'years "Father's avg. actual income using `c' years and totfaminc"

    * Rename the working variables so the next pass of the loop can recreate them
    ren indfather_totfaminc indfather_totfaminc_`c'years
    ren total_indf_totfaminc total_indf_totfaminc_`c'years
    }

    * Remove the tagging and temporary averaging variables
    drop *indf* *tmp* 

    * NOTE(review): byear_father is created here but is not in the keep
    * list below, so it does not reach the saved file. Confirm intent.
    rename birthyear byear_father
    label variable byear_father "Father's birth year (PSID var)"

    keep father_id mean* link_bothparents 

    //IMPORTANT: ONLY LINKED FATHERS WITH ACTUAL INCOME AVAILABLE ARE KEPT
    //           (at least one of the two averages is non-missing)
    keep if (mean_fathertotfaminc_3years!=. | mean_fathertotfaminc_5years!=.)

    * One row per father
    bysort father_id: keep if _n==1

    tempfile fathers_actualincome
    save `fathers_actualincome'

*------------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------------*

******************************
* MOTHERS (NO FATHER LINKED)
******************************

    use ./RawData/PSID_raw_indfam.dta, clear

* Person-level key (wide data), built the same way as the child id
    generate mother_id = famid*1000 + personnumber
    label variable mother_id "Mother 1968 ID"
    order mother_id, before(famid)

    keep mother_id relate* totfaminc* famid personnumber

* Keep only persons who appear in the saved file of mother ids
* (mothers of adult child respondents linked to a mother only)
    sort mother_id
    merge 1:1 mother_id using ./output/PSID_motherids_momlinkonly.dta, keep(match) nogenerate

*----------------------------------------*
/* Merge in other demographic variables 
   for head/reference person and the 
   wife/spouse/cohabitating partner */
*----------------------------------------*

    * The demographic file is keyed on son_id, so the mother key is
    * renamed for the merge and renamed back afterwards
    rename mother_id son_id

    merge 1:1 son_id using ./RawData/extra_demographicvars.dta, keep(master match) nogenerate
    keep son_id relate* totfaminc* age* birthyear* momlink_only

    rename son_id mother_id

* Reshape (long): one row per mother and survey year
    reshape long relate agehead_ agespouse_ ageind_ totfaminc birthyear_, ///
            i(mother_id) j(year)

    sort mother_id year

    * Strip the trailing underscore left on the reshaped stub names
    rename *_ *

* Age
/* The age-individual variable is used because, per the original
   author, it seems to be of better quality than the age-head or
   age-spouse variables.
*/
    * Codes 0 and 999 are treated as missing
    replace ageind = . if inlist(ageind,999,0)

    rename ageind age

    //IMPORTANT: RESTRICT AGE. AVERAGE INCOME IS TAKEN BETWEEN 30 AND 50
    //           (this restriction makes the panel unbalanced)
    keep if inrange(age,30,50)

**********************************************
* INCOME (ACTUAL, AVERAGED AROUND 40)
**********************************************

*-----------------------------------------------*
/* SUPER IMPORTANT: PUT TOTFAMINC IN 1950$ 
                    **BEFORE** AVERAGING */
*-----------------------------------------------*

    sort mother_id year

    * Keep a copy of the unadjusted value, then blank out zero or negative income
    generate totfaminc_orig = totfaminc
    replace totfaminc = . if totfaminc <= 0

    /* The deflator is looked up for the year before the observation's
       year (year_CPI = year - 1). Deflator rows without a matching
       observation are discarded. */
    generate year_CPI = year - 1
    merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

    replace totfaminc = totfaminc * deflator
    label variable totfaminc "Total family income (mothers), 1950 dollars"

    * Remove the lookup key and the variables brought in from the deflator file
    drop year_CPI CPI deflator
    sort mother_id

    global numberyears "3 5" 
    foreach c of global numberyears {

        /* `c' is the number of income observations to average (3, then 5).
           The search is centered on age 40 and reaches 10 years to
           either side, i.e. ages 30 through 50. */
        local pivot = 40
        local reach = 10

        * Condition shared by every tagging step: income is non-missing
        local hasinc "totfaminc!=."

        * Tag flag, seeded with the center-age observation when its income is usable
        gen indmother_totfaminc = (`hasinc' & age==`pivot')

        * Number of tagged observations for each mother
        by mother_id: egen total_indm_totfaminc = total(indmother_totfaminc)

        /* Walk outwards from the center one year at a time. At each
           distance the age above the center is tried first and the age
           below it second, so the upper band is arbitrarily privileged.
           An observation is tagged if its income is non-missing and the
           mother has not yet reached the observation count (`c').
           The per-mother count is rebuilt after every tagging step. */
        forvalues k = 0/`reach' {
            foreach dir in 1 -1 {
                replace indmother_totfaminc = 1 if `hasinc' & age==(`pivot' + `dir'*`k') & total_indm_totfaminc<`c'

                drop total_indm_totfaminc
                by mother_id: egen total_indm_totfaminc = total(indmother_totfaminc)
            }
        }

        /* Average income over the tagged years. The mean is computed on
           the tagged rows only and then spread to every row of the mother. */
        by mother_id: egen mean_mothertotfaminc_tmp_`c'years = mean(totfaminc) if indmother_totfaminc==1
        by mother_id: egen mean_mothertotfaminc_`c'years = max(mean_mothertotfaminc_tmp_`c'years)

        /*IMPORTANT: Replace average as "." for ids without `c' 
                     years of viable income */
        replace mean_mothertotfaminc_`c'years = . if total_indm_totfaminc<`c'
        label variable mean_mothertotfaminc_`c'years "mother's avg. actual income using `c' years and totfaminc"

        * Suffix the helper variables so that the next pass can recreate them
        foreach helper in indmother_totfaminc total_indm_totfaminc {
            rename `helper' `helper'_`c'years
        }
    }

    * Tagging flags, counters and temporary means are no longer needed
    drop *indm* *tmp* 

    rename birthyear byear_mother
    label variable byear_mother "Mother's birth year (PSID var)"

    /* NOTE(review): the keep below retains only the two identifiers and
       the mean* variables, so byear_mother is discarded right after being
       renamed and labelled -- confirm whether it was meant to be kept. */
    keep mother_id momlink_only mean* 

    //IMPORTANT: DROP LINKED MOTHERS WITHOUT ACTUAL INCOME AVAILABLE
    keep if mean_mothertotfaminc_3years!=. | mean_mothertotfaminc_5years!=.

    * Keep 1 obs per mother
    bysort mother_id: keep if _n==1

    tempfile mothers_actualincome
    save `mothers_actualincome'

**---------------------------------------------------------------------------------------**
*    PART 3: ATTACH ACTUAL PARENTAL INCOME TO ADULT CHILD RESPONDENTS
**---------------------------------------------------------------------------------------**

    use `adultchildren'

    /* Fathers: the match result is stored in _merge_fathers; records
       found only in the fathers file are discarded. */
    sort father_id
    merge m:1 father_id using `fathers_actualincome', keep(master match) generate(_merge_fathers)

    /* Mothers: the match result is stored in _merge_mothers; records
       found only in the mothers file are discarded.
       NOTE: MUST merge on momlink_only as well. 
             Multiple respondents have same mom 
             but different values for momlink_only. 
             Merging only on mom id will give actual 
             mother income to some respondents who 
             can be linked to a father.*/
    sort mother_id momlink_only 
    merge m:1 mother_id momlink_only using `mothers_actualincome', keep(master match) generate(_merge_mothers)

    //Dummy: Linked to a father with actual income
    generate linked_fatheractincome = _merge_fathers==3
    tabulate linked_fatheractincome, missing

    label variable linked_fatheractincome "Dummy =1 if R can be linked to a dad with actual income available"

    //Dummy: Linked to a mom or dad with actual income 
    generate linked_parentactincome = inlist(3, _merge_fathers, _merge_mothers)
    tabulate linked_parentactincome, missing

    label variable linked_parentactincome "Dummy =1 if R can be linked to a mom or dad with actual income available"

    * The merge indicators have served their purpose
    drop _merge*

****************************************************************
/* SUPER IMPORTANT: KEEP ONLY RESPONDENTS THAT CAN BE 
                    LINKED TO A PARENT WITH ACTUAL INCOME */
****************************************************************
    drop if linked_parentactincome!=1

*----------------------------------------------*
/* PUT ALL INCOME MEASURES IN 2015$ 
   (TO MATCH DAVIS AND MAZUMDER EXERCISE)
   BEFORE LOGGING */
*----------------------------------------------*

    /* Income is rescaled from 1950 to 2015 dollars with the CPI: 
       https://data.bls.gov/timeseries/CUUR0000SA0 */ 
    generate CPI1950 = 24.1 
    generate CPI2015 = 237.017

    * Income measures currently expressed in 1950 dollars
    global inc1950 "mean_totfaminc_3years mean_totfaminc_5years mean_fathertotfaminc_3years mean_fathertotfaminc_5years mean_mothertotfaminc_3years mean_mothertotfaminc_5years"

    foreach incvar in $inc1950 {
        * Capture the current label so the new unit can be appended to it
        local oldlab : variable label `incvar'

        replace `incvar' = (CPI2015/CPI1950) * `incvar'
        label variable `incvar' "`oldlab', 2015 dollars"
    }

    * The CPI helper variables are not part of the output
    drop CPI*

*-----------------------------------------------*
* LOG ALL RELEVANT INCOME MEASURES
*-----------------------------------------------*

    /* Loop over the explicit list of income measures held in the global
       inc1950 instead of a positional first-last variable range. A range
       expands to every variable that sits between its two endpoints in the
       current variable order, so it can silently pick up unrelated
       variables whenever that order is not exactly the expected one.
       For each measure a log_ counterpart is created, and its label is
       the source label prefixed with "(Logged)". */
    foreach var of global inc1950 {
        gen log_`var' = ln(`var')

        local label: var label `var'
        label var log_`var' "(Logged) `label'"
    }

*----------------------
* Save
*----------------------

    /* Keep one obs per adult child respondent.
       The rows are ordered by son_id and year first, so the row that is
       retained is always the earliest-year one. Sorting on son_id alone
       orders tied observations arbitrarily, which means the surviving row
       could differ from run to run whenever a remaining variable varies
       within son_id. The stable option keeps the current relative order
       should a respondent have more than one row for the same year. */
    sort son_id year, stable
    by son_id: keep if _n==1

    * Variables that are not part of the output file (year is dropped only after the sort above)
    drop *_SP* *_HD* interviewnumber-racehead birthyear_orig followstatus total_foreignborn totfaminc_orig age* year 

    duplicates report son_id //no duplicates
    compress
    save ./output/PSID_bothparents_clean, replace


